Pandas Cheat Sheet
Contents
Pandas Cheat Sheet#
Run Jupyter:
jupyter notebook
Build book:
jupyter-book build pandas_cheatsheet.ipynb
Setup Environment#
# ensures you are running the pip version associated with the current Python kernel
import sys
!{sys.executable} -m pip install requests
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install plotly
Requirement already satisfied: requests in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (2.28.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (1.26.14)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from requests) (3.0.1)
^C
Collecting scipy
Using cached scipy-1.10.0-cp310-cp310-win_amd64.whl (42.5 MB)
Requirement already satisfied: numpy<1.27.0,>=1.19.5 in c:\users\wsaye\pycharmprojects\wmsayer.github.io\venv\lib\site-packages (from scipy) (1.24.1)
Installing collected packages: scipy
import pandas as pd
import numpy as np
import json
import requests
import datetime as dt
from dateutil.relativedelta import relativedelta
import scipy
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo
# Set notebook mode to work in offline
pyo.init_notebook_mode()
Data & File I/O#
JSON Handling#
Read JSON File#
# load local JSON file and return as Python dict
# load local JSON file and return as Python dict
def json_load(f_path):
    """Load a local JSON file and return its parsed contents.

    Parameters
    ----------
    f_path : str
        Path to the JSON file on disk.

    Returns
    -------
    The deserialized JSON payload (typically a dict).
    """
    # 'with' guarantees the file handle is closed even if json.load raises
    # (original code opened with a stray trailing comma and closed manually).
    with open(f_path, "r", encoding="utf-8") as f:
        return json.load(f)
Flattening (JSON Normalize)#
json_norm_dict = [
{
"id": 1,
"name": "Cole Volk",
"fitness": {"height": 130, "weight": 60},
},
{"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
{
"id": 2,
"name": "Faye Raker",
"fitness": {"height": 130, "weight": 60},
},
]
pd.json_normalize(json_norm_dict, max_level=0)
| id | name | fitness | |
|---|---|---|---|
| 0 | 1.0 | Cole Volk | {'height': 130, 'weight': 60} |
| 1 | NaN | Mark Reg | {'height': 130, 'weight': 60} |
| 2 | 2.0 | Faye Raker | {'height': 130, 'weight': 60} |
pd.json_normalize(json_norm_dict, max_level=1)
| id | name | fitness.height | fitness.weight | |
|---|---|---|---|---|
| 0 | 1.0 | Cole Volk | 130 | 60 |
| 1 | NaN | Mark Reg | 130 | 60 |
| 2 | 2.0 | Faye Raker | 130 | 60 |
json_norm_dict = [
{
"state": "Florida",
"shortname": "FL",
"info": {"governor": "Rick Scott"},
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
{
"state": "Ohio",
"shortname": "OH",
"info": {"governor": "John Kasich"},
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
},
]
pd.json_normalize(json_norm_dict,
record_path="counties",
meta=["state", "shortname", ["info", "governor"]])
| name | population | state | shortname | info.governor | |
|---|---|---|---|---|---|
| 0 | Dade | 12345 | Florida | FL | Rick Scott |
| 1 | Broward | 40000 | Florida | FL | Rick Scott |
| 2 | Palm Beach | 60000 | Florida | FL | Rick Scott |
| 3 | Summit | 1234 | Ohio | OH | John Kasich |
| 4 | Cuyahoga | 1337 | Ohio | OH | John Kasich |
Load JSON via REST API#
# make GET request to REST API and return as Python dict
# make GET request to REST API and return as Python dict
def run_json_get(url, params=None, headers=None, print_summ=True, print_resp=False):
    """Make a GET request to a REST API and return the JSON body.

    Parameters
    ----------
    url : str
        Full endpoint URL.
    params : dict, optional
        Query-string parameters passed to requests.get.
    headers : dict, optional
        HTTP headers passed to requests.get.
    print_summ : bool
        Print a request/response summary to stdout.
    print_resp : bool
        Print the full response body to stdout.

    Returns
    -------
    tuple[dict | list, int]
        (parsed JSON body, HTTP status code); body is {} on non-200 status.
    """
    # Avoid mutable default arguments: a shared {} default would leak state
    # across calls if a callee ever mutated it.
    params = {} if params is None else params
    headers = {} if headers is None else headers
    if print_summ:
        print("/"*69)
        print("GET Address: %s\nHeaders %s:\nParameters %s:" % (url, repr(headers), repr(params)))
    response = requests.get(url, params=params, headers=headers)
    status_code = response.status_code
    # Only attempt to parse the body on success; otherwise return an empty dict.
    if status_code == 200:
        json_dict = response.json()
    else:
        json_dict = {}
    if print_summ:
        print("Status Code: %d" % status_code)
        # A JSON body may also be a list; only dicts have .keys().
        if isinstance(json_dict, dict):
            print("Response Keys: %s\n" % json_dict.keys())
    if print_resp:
        print("Response: %s\n" % json_dict)
    return json_dict, status_code
Read SQL Query w/ Params#
def read_sql(f_path, params=None):
    """Read a SQL query from a file, optionally filling {named} placeholders.

    Parameters
    ----------
    f_path : str
        Path to the .sql file.
    params : dict, optional
        Mapping of placeholder names to values; applied via str.format.

    Returns
    -------
    str
        The query text, formatted with params when provided.
    """
    # 'with' replaces the manual open/close pair; avoid a mutable {} default.
    with open(f_path, "r") as f:
        query = f.read()
    if params:
        query = query.format(**params)
    return query
I/O tests#
# read JSON
test_json_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/data.json"
print(json_load(test_json_path))
{'api_key': 'fdghdfghfgfg'}
# read SQL (no params)
test_sql_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/query.sql"
print(read_sql(test_sql_path))
SELECT *
FROM my_table
WHERE id = 1
# read SQL (w/ params)
test_sql_params_path = "C:/Users/wsaye/PycharmProjects/CashAppInterview/templates/query_w_params.sql"
test_params = {"my_id": 102393, "max_date": "2000/01/01"}
print(read_sql(test_sql_params_path, params=test_params))
SELECT *
FROM my_table
WHERE id = 102393 OR date = '2000/01/01'
Sample REST Dataset (CoinMetrics)#
def get_asset_metrics(assets, metrics, freq, alt_params=None, page_size=10000, print_summ=True):
    """Pull timeseries asset metrics from the CoinMetrics community API.

    Parameters
    ----------
    assets : list[str]
        Asset tickers, e.g. ['btc', 'eth'].
    metrics : list[str]
        Metric names, e.g. ['AdrActCnt', 'PriceUSD'].
    freq : str
        Sampling frequency: one of 1b, 1s, 1m, 1h, 1d.
    alt_params : dict, optional
        Extra query parameters (e.g. 'start_time'/'end_time'; formats like
        "2006-01-20T00:00:00Z" and "2006-01-20" are supported).
    page_size : int
        Max rows per API page.
    print_summ : bool
        Forwarded to run_json_get's summary printing.

    Returns
    -------
    pandas.DataFrame
        Sorted by (asset, time) with the requested metric columns cast to float.

    References:
    https://docs.coinmetrics.io/api/v4#operation/getTimeseriesAssetMetrics
    https://docs.coinmetrics.io/info/metrics
    """
    # Avoid mutable default argument; build a fresh dict per call.
    alt_params = {} if alt_params is None else alt_params
    assets_str = ", ".join(assets)
    metrics_str = ", ".join(metrics)
    api_root = 'https://community-api.coinmetrics.io/v4'
    data_key = "data"
    url = "/".join([api_root, "timeseries/asset-metrics"])
    params = {'assets': assets_str, 'metrics': metrics_str, 'frequency': freq,
              'page_size': page_size}
    # Caller-supplied params override the defaults above.
    params.update(alt_params)
    result_dict, status_code = run_json_get(url, params=params, headers={}, print_summ=print_summ)
    result_df = pd.DataFrame(result_dict[data_key])
    result_df.sort_values(by=["asset", "time"], inplace=True)
    result_df.reset_index(inplace=True, drop=True)
    # API returns metric values as strings; cast each requested metric to float.
    for m in metrics:
        result_df[m] = result_df[m].astype(float)
    return result_df
def load_asset_metric_data(pull_new):
    """Load asset-metric data from the API or from the local CSV cache.

    pull_new=True fetches fresh data via get_asset_metrics and rewrites the
    CSV cache; pull_new=False reads the previously cached CSV.

    NOTE(review): relies on notebook globals (test_assets, test_metrics,
    test_freq, test_df_cache) that are defined in a *later* cell — cells must
    be executed in the intended order for this to work.
    """
    if pull_new:
        df = get_asset_metrics(test_assets, test_metrics, test_freq, print_summ=False)
        df.to_csv(test_df_cache, index=False)
    else:
        df = pd.read_csv(test_df_cache)
    return df
Get data w/ cache#
test_assets = ['btc', 'eth']
test_metrics = ['AdrActCnt', 'PriceUSD']
test_freq = '1d'
test_df_cache = "C:/Users/wsaye/PycharmProjects/CashAppInterview/data/cm_test_data.csv"
test_df = load_asset_metric_data(True)
test_df = test_df.dropna(subset=test_metrics).reset_index(drop=True)
test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df
Pandas Options & Settings#
Docs and available options here
pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 10)
Summarize Dataframe#
General Info#
test_df.describe() # summary stats of columns
| AdrActCnt | PriceUSD | |
|---|---|---|
| count | 7.213000e+03 | 7213.000000 |
| mean | 4.299169e+05 | 5678.012019 |
| std | 3.559791e+05 | 12124.585363 |
| min | 4.080000e+02 | 0.050541 |
| 25% | 8.679800e+04 | 117.783464 |
| 50% | 4.062670e+05 | 467.321788 |
| 75% | 6.681570e+05 | 4335.593450 |
| max | 7.157228e+06 | 67541.755508 |
test_df.info() # dataframe schema info, column types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7213 entries, 0 to 7212
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 asset 7213 non-null object
1 time 7213 non-null object
2 AdrActCnt 7213 non-null float64
3 PriceUSD 7213 non-null float64
dtypes: float64(2), object(2)
memory usage: 225.5+ KB
test_df.dtypes
asset object
time object
AdrActCnt float64
PriceUSD float64
dtype: object
test_df.head()
| asset | time | AdrActCnt | PriceUSD | |
|---|---|---|---|---|
| 0 | btc | 2010-07-18T00:00:00.000000000Z | 860.0 | 0.085840 |
| 1 | btc | 2010-07-19T00:00:00.000000000Z | 929.0 | 0.080800 |
| 2 | btc | 2010-07-20T00:00:00.000000000Z | 936.0 | 0.074736 |
| 3 | btc | 2010-07-21T00:00:00.000000000Z | 784.0 | 0.079193 |
| 4 | btc | 2010-07-22T00:00:00.000000000Z | 594.0 | 0.058470 |
print(type(test_df.loc[0, "time"])) # type of particular entry
<class 'str'>
test_df.nlargest(5, "PriceUSD")
# test_df.nsmallest(5, "PriceUSD")
| asset | time | AdrActCnt | PriceUSD | |
|---|---|---|---|---|
| 4131 | btc | 2021-11-08T00:00:00.000000000Z | 1018796.0 | 67541.755508 |
| 4132 | btc | 2021-11-09T00:00:00.000000000Z | 1195638.0 | 67095.585671 |
| 4112 | btc | 2021-10-20T00:00:00.000000000Z | 977215.0 | 66061.796564 |
| 4137 | btc | 2021-11-14T00:00:00.000000000Z | 806451.0 | 65032.225655 |
| 4134 | btc | 2021-11-11T00:00:00.000000000Z | 1037951.0 | 64962.931294 |
test_df["asset"].unique()
# test_df["asset"].nunique()
array(['btc', 'eth'], dtype=object)
Crosstab#
cross_df = test_df.loc[test_df["asset"]== "btc", ["datetime", "dayname", "PriceUSD"]].copy().dropna()
cross_df = cross_df.sort_values(by="datetime")
cross_df["7d_SMA"] = cross_df["PriceUSD"].rolling(7).mean()
cross_df["beating_SMA"] = cross_df["PriceUSD"] > cross_df["7d_SMA"]
cross_df["return"] = cross_df["PriceUSD"].pct_change()
cross_df.dropna(inplace=True)
cross_df
| datetime | dayname | PriceUSD | 7d_SMA | beating_SMA | return | |
|---|---|---|---|---|---|---|
| 6 | 2010-07-24 00:00:00+00:00 | Saturday | 0.054540 | 0.070596 | False | -0.099894 |
| 7 | 2010-07-25 00:00:00+00:00 | Sunday | 0.050541 | 0.065553 | False | -0.073329 |
| 8 | 2010-07-26 00:00:00+00:00 | Monday | 0.056000 | 0.062010 | False | 0.108020 |
| 9 | 2010-07-27 00:00:00+00:00 | Tuesday | 0.058622 | 0.059708 | False | 0.046822 |
| 10 | 2010-07-28 00:00:00+00:00 | Wednesday | 0.058911 | 0.056811 | True | 0.004931 |
| ... | ... | ... | ... | ... | ... | ... |
| 4525 | 2022-12-07 00:00:00+00:00 | Wednesday | 16848.251824 | 16991.106107 | False | -0.013020 |
| 4526 | 2022-12-08 00:00:00+00:00 | Thursday | 17231.456322 | 17029.701038 | True | 0.022744 |
| 4527 | 2022-12-09 00:00:00+00:00 | Friday | 17136.149001 | 17038.421018 | True | -0.005531 |
| 4528 | 2022-12-10 00:00:00+00:00 | Saturday | 17124.214680 | 17070.522061 | True | -0.000696 |
| 4529 | 2022-12-11 00:00:00+00:00 | Sunday | 17094.360659 | 17066.614300 | True | -0.001743 |
4524 rows × 6 columns
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'])
| dayname | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| beating_SMA | |||||||
| False | 269 | 285 | 292 | 293 | 276 | 297 | 282 |
| True | 377 | 361 | 355 | 354 | 370 | 349 | 364 |
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], normalize=True)
| dayname | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| beating_SMA | |||||||
| False | 0.059461 | 0.062997 | 0.064545 | 0.064766 | 0.061008 | 0.065650 | 0.062334 |
| True | 0.083333 | 0.079797 | 0.078470 | 0.078249 | 0.081786 | 0.077144 | 0.080460 |
pd.crosstab(cross_df['beating_SMA'], cross_df['dayname'], values=cross_df['return'], aggfunc=np.mean)
| dayname | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| beating_SMA | |||||||
| False | -0.018046 | -0.020816 | -0.012431 | -0.009124 | -0.020508 | -0.017438 | -0.01737 |
| True | 0.019836 | 0.025968 | 0.017233 | 0.008952 | 0.022115 | 0.025215 | 0.02249 |
Sort/ Rank#
sort_df = test_df[["date", "asset", "PriceUSD"]].copy()
sort_df['price_rank'] = sort_df["PriceUSD"].rank(ascending=True, pct=False)
sort_df['price_pct'] = sort_df["PriceUSD"].rank(ascending=True, pct=True)
sort_df
| date | asset | PriceUSD | price_rank | price_pct | |
|---|---|---|---|---|---|
| 0 | 2010-07-18 | btc | 0.085840 | 83.0 | 0.011507 |
| 1 | 2010-07-19 | btc | 0.080800 | 81.0 | 0.011230 |
| 2 | 2010-07-20 | btc | 0.074736 | 79.0 | 0.010952 |
| 3 | 2010-07-21 | btc | 0.079193 | 80.0 | 0.011091 |
| 4 | 2010-07-22 | btc | 0.058470 | 5.0 | 0.000693 |
| ... | ... | ... | ... | ... | ... |
| 7208 | 2022-12-07 | eth | 1232.552454 | 4488.0 | 0.622210 |
| 7209 | 2022-12-08 | eth | 1281.633052 | 4529.0 | 0.627894 |
| 7210 | 2022-12-09 | eth | 1263.208619 | 4516.0 | 0.626092 |
| 7211 | 2022-12-10 | eth | 1266.771442 | 4518.0 | 0.626369 |
| 7212 | 2022-12-11 | eth | 1263.069667 | 4515.0 | 0.625953 |
7213 rows × 5 columns
sort_df.sort_values(by="price_rank", ascending=False)
| date | asset | PriceUSD | price_rank | price_pct | |
|---|---|---|---|---|---|
| 4131 | 2021-11-08 | btc | 67541.755508 | 7213.0 | 1.000000 |
| 4132 | 2021-11-09 | btc | 67095.585671 | 7212.0 | 0.999861 |
| 4112 | 2021-10-20 | btc | 66061.796564 | 7211.0 | 0.999723 |
| 4137 | 2021-11-14 | btc | 65032.225655 | 7210.0 | 0.999584 |
| 4134 | 2021-11-11 | btc | 64962.931294 | 7209.0 | 0.999445 |
| ... | ... | ... | ... | ... | ... |
| 4 | 2010-07-22 | btc | 0.058470 | 5.0 | 0.000693 |
| 17 | 2010-08-04 | btc | 0.057016 | 4.0 | 0.000555 |
| 8 | 2010-07-26 | btc | 0.056000 | 3.0 | 0.000416 |
| 6 | 2010-07-24 | btc | 0.054540 | 2.0 | 0.000277 |
| 7 | 2010-07-25 | btc | 0.050541 | 1.0 | 0.000139 |
7213 rows × 5 columns
Cleaning#
Deleting Rows/Columns here
Replace#
replace_df = test_df[["date", "asset", "dayname"]].copy()
# replace_df.replace("Sunday", "Sun")
replace_df.replace({"Sunday": "S", "Monday": "M", "Tuesday": "T"})
| date | asset | dayname | |
|---|---|---|---|
| 0 | 2010-07-18 | btc | S |
| 1 | 2010-07-19 | btc | M |
| 2 | 2010-07-20 | btc | T |
| 3 | 2010-07-21 | btc | Wednesday |
| 4 | 2010-07-22 | btc | Thursday |
| ... | ... | ... | ... |
| 7208 | 2022-12-07 | eth | Wednesday |
| 7209 | 2022-12-08 | eth | Thursday |
| 7210 | 2022-12-09 | eth | Friday |
| 7211 | 2022-12-10 | eth | Saturday |
| 7212 | 2022-12-11 | eth | S |
7213 rows × 3 columns
Drop/Fill NA()#
cleaning_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
cleaning_df
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | 0.080800 | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | 0.079193 | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
# cleaning_df.dropna() # drops N/A looking in all columns
cleaning_df.dropna(subset=["eth"]) # drops N/A in subset only
| asset | btc | eth |
|---|---|---|
| date | ||
| 2015-08-08 | 261.450276 | 1.199990 |
| 2015-08-09 | 266.342020 | 1.199990 |
| 2015-08-10 | 264.928825 | 1.199990 |
| 2015-08-11 | 271.421736 | 0.990000 |
| 2015-08-12 | 268.143868 | 1.288000 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
2683 rows × 2 columns
cleaning_df.fillna(-1)
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | -1.000000 |
| 2010-07-19 | 0.080800 | -1.000000 |
| 2010-07-20 | 0.074736 | -1.000000 |
| 2010-07-21 | 0.079193 | -1.000000 |
| 2010-07-22 | 0.058470 | -1.000000 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
cleaning_df.fillna(method="ffill")
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | 0.080800 | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | 0.079193 | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
cleaning_df.fillna(method="bfill")
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | 1.199990 |
| 2010-07-19 | 0.080800 | 1.199990 |
| 2010-07-20 | 0.074736 | 1.199990 |
| 2010-07-21 | 0.079193 | 1.199990 |
| 2010-07-22 | 0.058470 | 1.199990 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
# setup df for interpolation
interp_df = cleaning_df.iloc[cleaning_df.shape[0] - 5:, :].copy()
interp_df["btc_og"] = interp_df["btc"]
interp_df["eth_og"] = interp_df["eth"]
interp_df.iloc[1, 0:2] = [np.nan ,np.nan]
interp_df.interpolate(method="linear")
| asset | btc | eth | btc_og | eth_og |
|---|---|---|---|---|
| date | ||||
| 2022-12-07 | 16848.251824 | 1232.552454 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 16992.200413 | 1247.880537 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 | 17094.360659 | 1263.069667 |
Selecting/Sampling#
test_df.select_dtypes(include='float64')
| AdrActCnt | PriceUSD | |
|---|---|---|
| 0 | 860.0 | 0.085840 |
| 1 | 929.0 | 0.080800 |
| 2 | 936.0 | 0.074736 |
| 3 | 784.0 | 0.079193 |
| 4 | 594.0 | 0.058470 |
| ... | ... | ... |
| 7208 | 517421.0 | 1232.552454 |
| 7209 | 528927.0 | 1281.633052 |
| 7210 | 1505652.0 | 1263.208619 |
| 7211 | 733795.0 | 1266.771442 |
| 7212 | 723883.0 | 1263.069667 |
7213 rows × 2 columns
# test_df.sample(n = 200)
test_df.sample(frac = 0.25, random_state=42)
| asset | time | AdrActCnt | PriceUSD | datetime | dayname | date | |
|---|---|---|---|---|---|---|---|
| 308 | btc | 2011-05-22T00:00:00.000000000Z | 10846.0 | 6.711956 | 2011-05-22 00:00:00+00:00 | Sunday | 2011-05-22 |
| 381 | btc | 2011-08-03T00:00:00.000000000Z | 18614.0 | 9.286357 | 2011-08-03 00:00:00+00:00 | Wednesday | 2011-08-03 |
| 5716 | eth | 2018-11-06T00:00:00.000000000Z | 278348.0 | 218.089756 | 2018-11-06 00:00:00+00:00 | Tuesday | 2018-11-06 |
| 2312 | btc | 2016-11-15T00:00:00.000000000Z | 651339.0 | 712.347153 | 2016-11-15 00:00:00+00:00 | Tuesday | 2016-11-15 |
| 251 | btc | 2011-03-26T00:00:00.000000000Z | 4838.0 | 0.855200 | 2011-03-26 00:00:00+00:00 | Saturday | 2011-03-26 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 5314 | eth | 2017-09-30T00:00:00.000000000Z | 150248.0 | 301.653575 | 2017-09-30 00:00:00+00:00 | Saturday | 2017-09-30 |
| 5546 | eth | 2018-05-20T00:00:00.000000000Z | 389190.0 | 713.591691 | 2018-05-20 00:00:00+00:00 | Sunday | 2018-05-20 |
| 6894 | eth | 2022-01-27T00:00:00.000000000Z | 537712.0 | 2412.624470 | 2022-01-27 00:00:00+00:00 | Thursday | 2022-01-27 |
| 2118 | btc | 2016-05-05T00:00:00.000000000Z | 455031.0 | 449.336072 | 2016-05-05 00:00:00+00:00 | Thursday | 2016-05-05 |
| 1235 | btc | 2013-12-04T00:00:00.000000000Z | 186429.0 | 1134.932231 | 2013-12-04 00:00:00+00:00 | Wednesday | 2013-12-04 |
1803 rows × 7 columns
Boolean Selection#
bool_df = test_df[["date", "asset", "PriceUSD"]].pivot(index="date", columns="asset", values="PriceUSD")
bool_df
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | 0.080800 | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | 0.079193 | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
# returns Series of same shape w/ np.NaN at failing rows (default)
# bool_df['PriceUSD'].where(bool_df['PriceUSD'] > 10**4) # returns np.Nan in failing rows
bool_df['eth'].where(bool_df['eth'] > 10**3, 0) # returns 0 in failing rows
date
2010-07-18 0.000000
2010-07-19 0.000000
2010-07-20 0.000000
2010-07-21 0.000000
2010-07-22 0.000000
...
2022-12-07 1232.552454
2022-12-08 1281.633052
2022-12-09 1263.208619
2022-12-10 1266.771442
2022-12-11 1263.069667
Name: eth, Length: 4530, dtype: float64
test_df["asset"].isin(["btc"])
0 True
1 True
2 True
3 True
4 True
...
7208 False
7209 False
7210 False
7211 False
7212 False
Name: asset, Length: 7213, dtype: bool
na_check_df = bool_df.isna()
na_check_series = na_check_df.any(axis=1) # aggregate booleans
bool_df.loc[na_check_series, :]
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | 0.080800 | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | 0.079193 | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2015-08-03 | 282.185052 | NaN |
| 2015-08-04 | 285.286617 | NaN |
| 2015-08-05 | 282.338887 | NaN |
| 2015-08-06 | 278.995749 | NaN |
| 2015-08-07 | 279.488715 | NaN |
1847 rows × 2 columns
bool_df.loc[(bool_df['eth'] > 10**3), :]
| asset | btc | eth |
|---|---|---|
| date | ||
| 2018-01-06 | 17103.589280 | 1006.157475 |
| 2018-01-07 | 16231.694999 | 1102.889537 |
| 2018-01-08 | 14937.415089 | 1134.699060 |
| 2018-01-09 | 14378.586217 | 1288.406875 |
| 2018-01-10 | 14669.088266 | 1231.767295 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
731 rows × 2 columns
Boolean Operators#
bool_df.loc[~(bool_df['eth'] > 10**3), :] # NOT
| asset | btc | eth |
|---|---|---|
| date | ||
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | 0.080800 | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | 0.079193 | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2020-12-31 | 29022.671413 | 739.025850 |
| 2021-01-01 | 29380.693733 | 730.914321 |
| 2021-01-02 | 32022.681058 | 775.296622 |
| 2021-01-03 | 33277.835305 | 990.365325 |
| 2022-06-18 | 19013.867254 | 992.790097 |
3799 rows × 2 columns
bool_df.loc[(bool_df['eth'] > 10**3) & (bool_df['eth'] > 10), :] # AND
| asset | btc | eth |
|---|---|---|
| date | ||
| 2018-01-06 | 17103.589280 | 1006.157475 |
| 2018-01-07 | 16231.694999 | 1102.889537 |
| 2018-01-08 | 14937.415089 | 1134.699060 |
| 2018-01-09 | 14378.586217 | 1288.406875 |
| 2018-01-10 | 14669.088266 | 1231.767295 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
731 rows × 2 columns
bool_df.loc[(bool_df['eth'] > 10**3) | (bool_df['eth'] > 10), :] # OR
| asset | btc | eth |
|---|---|---|
| date | ||
| 2016-03-05 | 397.314454 | 10.751006 |
| 2016-03-06 | 403.705187 | 10.984407 |
| 2016-03-09 | 412.540704 | 11.817099 |
| 2016-03-10 | 416.340929 | 11.165651 |
| 2016-03-11 | 419.511934 | 11.122905 |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
2380 rows × 2 columns
Datetime#
Python datetime <-> string formatting here
Datetime to string#
Python:
test_date_str = "2022-01-01"
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d"))
print(dt.datetime.strptime(test_date_str, "%Y-%m-%d").date())
2022-01-01 00:00:00
2022-01-01
Pandas:
test_df["datetime"] = pd.to_datetime(test_df["time"], utc=True) # str timestamp to datetime
test_df['datetime_alt'] = pd.to_datetime(test_df["time"], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
test_df["date"] = pd.to_datetime(test_df["time"], utc=True).dt.date # datetime to date
test_df['datetime_str'] = test_df["datetime"].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ') # datetime to str
test_df['date_str'] = pd.to_datetime(test_df["date"]).dt.strftime('%Y-%m-%d') # date to str
test_df[["datetime", "datetime_alt", "date", "datetime_str", "date_str"]]
| datetime | datetime_alt | date | datetime_str | date_str | |
|---|---|---|---|---|---|
| 0 | 2010-07-18 00:00:00+00:00 | 2010-07-18 00:00:00+00:00 | 2010-07-18 | 2010-07-18T00:00:00.000000Z | 2010-07-18 |
| 1 | 2010-07-19 00:00:00+00:00 | 2010-07-19 00:00:00+00:00 | 2010-07-19 | 2010-07-19T00:00:00.000000Z | 2010-07-19 |
| 2 | 2010-07-20 00:00:00+00:00 | 2010-07-20 00:00:00+00:00 | 2010-07-20 | 2010-07-20T00:00:00.000000Z | 2010-07-20 |
| 3 | 2010-07-21 00:00:00+00:00 | 2010-07-21 00:00:00+00:00 | 2010-07-21 | 2010-07-21T00:00:00.000000Z | 2010-07-21 |
| 4 | 2010-07-22 00:00:00+00:00 | 2010-07-22 00:00:00+00:00 | 2010-07-22 | 2010-07-22T00:00:00.000000Z | 2010-07-22 |
| ... | ... | ... | ... | ... | ... |
| 7208 | 2022-12-07 00:00:00+00:00 | 2022-12-07 00:00:00+00:00 | 2022-12-07 | 2022-12-07T00:00:00.000000Z | 2022-12-07 |
| 7209 | 2022-12-08 00:00:00+00:00 | 2022-12-08 00:00:00+00:00 | 2022-12-08 | 2022-12-08T00:00:00.000000Z | 2022-12-08 |
| 7210 | 2022-12-09 00:00:00+00:00 | 2022-12-09 00:00:00+00:00 | 2022-12-09 | 2022-12-09T00:00:00.000000Z | 2022-12-09 |
| 7211 | 2022-12-10 00:00:00+00:00 | 2022-12-10 00:00:00+00:00 | 2022-12-10 | 2022-12-10T00:00:00.000000Z | 2022-12-10 |
| 7212 | 2022-12-11 00:00:00+00:00 | 2022-12-11 00:00:00+00:00 | 2022-12-11 | 2022-12-11T00:00:00.000000Z | 2022-12-11 |
7213 rows × 5 columns
Datetime Altering#
Python:
test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt + dt.timedelta(days=2))
print(test_dt + relativedelta(years=5, months=4))
2022-01-03 00:00:00
2027-05-01 00:00:00
Pandas:
# (also pd.Timedelta)
test_df["date_offset"] = test_df["date"] + pd.DateOffset(years=2, months=2, days=1)
test_df[["date", "date_offset"]]
| date | date_offset | |
|---|---|---|
| 0 | 2010-07-18 | 2012-09-19 |
| 1 | 2010-07-19 | 2012-09-20 |
| 2 | 2010-07-20 | 2012-09-21 |
| 3 | 2010-07-21 | 2012-09-22 |
| 4 | 2010-07-22 | 2012-09-23 |
| ... | ... | ... |
| 7208 | 2022-12-07 | 2025-02-08 |
| 7209 | 2022-12-08 | 2025-02-09 |
| 7210 | 2022-12-09 | 2025-02-10 |
| 7211 | 2022-12-10 | 2025-02-11 |
| 7212 | 2022-12-11 | 2025-02-12 |
7213 rows × 2 columns
Date Parts (Week & Weekday)#
Python:
test_date_str = "2022-01-01"
test_dt = dt.datetime.strptime(test_date_str, "%Y-%m-%d")
print(test_dt.isocalendar().week)
print(test_dt.weekday())
print(test_dt.strftime('%A')) # get day name
52
5
Saturday
Pandas:
test_df["week"] = test_df["datetime"].dt.isocalendar().week
test_df["weekday"] = test_df["datetime"].dt.weekday
test_df["dayname"] = test_df["datetime"].dt.day_name()
test_df["monthname"] = test_df["datetime"].dt.month_name()
test_df["year"] = test_df["datetime"].dt.year
test_df[["date", "week", "weekday", "dayname", "monthname", "year"]]
| date | week | weekday | dayname | monthname | year | |
|---|---|---|---|---|---|---|
| 0 | 2010-07-18 | 28 | 6 | Sunday | July | 2010 |
| 1 | 2010-07-19 | 29 | 0 | Monday | July | 2010 |
| 2 | 2010-07-20 | 29 | 1 | Tuesday | July | 2010 |
| 3 | 2010-07-21 | 29 | 2 | Wednesday | July | 2010 |
| 4 | 2010-07-22 | 29 | 3 | Thursday | July | 2010 |
| ... | ... | ... | ... | ... | ... | ... |
| 7208 | 2022-12-07 | 49 | 2 | Wednesday | December | 2022 |
| 7209 | 2022-12-08 | 49 | 3 | Thursday | December | 2022 |
| 7210 | 2022-12-09 | 49 | 4 | Friday | December | 2022 |
| 7211 | 2022-12-10 | 49 | 5 | Saturday | December | 2022 |
| 7212 | 2022-12-11 | 49 | 6 | Sunday | December | 2022 |
7213 rows × 6 columns
Numerical#
num_df = test_df.loc[3000:, ["date", "asset", "PriceUSD"]].copy()
num_df["PriceUSD_rnd"] = num_df["PriceUSD"].round()
num_df["PriceUSD_rnd1"] = num_df["PriceUSD"].round(1)
num_df["PriceUSD_floor"] = np.floor(num_df["PriceUSD"])
num_df["PriceUSD_ceil"] = np.ceil(num_df["PriceUSD"])
num_df
| date | asset | PriceUSD | PriceUSD_rnd | PriceUSD_rnd1 | PriceUSD_floor | PriceUSD_ceil | |
|---|---|---|---|---|---|---|---|
| 3000 | 2018-10-04 | btc | 6545.285668 | 6545.0 | 6545.3 | 6545.0 | 6546.0 |
| 3001 | 2018-10-05 | btc | 6585.149580 | 6585.0 | 6585.1 | 6585.0 | 6586.0 |
| 3002 | 2018-10-06 | btc | 6550.478316 | 6550.0 | 6550.5 | 6550.0 | 6551.0 |
| 3003 | 2018-10-07 | btc | 6564.568260 | 6565.0 | 6564.6 | 6564.0 | 6565.0 |
| 3004 | 2018-10-08 | btc | 6604.685274 | 6605.0 | 6604.7 | 6604.0 | 6605.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 7208 | 2022-12-07 | eth | 1232.552454 | 1233.0 | 1232.6 | 1232.0 | 1233.0 |
| 7209 | 2022-12-08 | eth | 1281.633052 | 1282.0 | 1281.6 | 1281.0 | 1282.0 |
| 7210 | 2022-12-09 | eth | 1263.208619 | 1263.0 | 1263.2 | 1263.0 | 1264.0 |
| 7211 | 2022-12-10 | eth | 1266.771442 | 1267.0 | 1266.8 | 1266.0 | 1267.0 |
| 7212 | 2022-12-11 | eth | 1263.069667 | 1263.0 | 1263.1 | 1263.0 | 1264.0 |
4213 rows × 7 columns
Transforms#
Indexes#
df.set_index(keys, drop=True, verify_integrity=False)
df.reset_index(drop=False)
df.reindex()
Pivot & Melt#
pd.unstack() - pivot multilevel index
Pivot to MultiIndex#
pivot_df = test_df.pivot(index="date", columns="asset", values=['AdrActCnt', 'PriceUSD'])
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
| date | AdrActCnt | PriceUSD | |||
|---|---|---|---|---|---|
| asset | btc | eth | btc | eth | |
| 0 | 2010-07-18 | 860.0 | NaN | 0.085840 | NaN |
| 1 | 2010-07-19 | 929.0 | NaN | 0.080800 | NaN |
| 2 | 2010-07-20 | 936.0 | NaN | 0.074736 | NaN |
| 3 | 2010-07-21 | 784.0 | NaN | 0.079193 | NaN |
| 4 | 2010-07-22 | 594.0 | NaN | 0.058470 | NaN |
| ... | ... | ... | ... | ... | ... |
| 4525 | 2022-12-07 | 899850.0 | 517421.0 | 16848.251824 | 1232.552454 |
| 4526 | 2022-12-08 | 905789.0 | 528927.0 | 17231.456322 | 1281.633052 |
| 4527 | 2022-12-09 | 946767.0 | 1505652.0 | 17136.149001 | 1263.208619 |
| 4528 | 2022-12-10 | 909506.0 | 733795.0 | 17124.214680 | 1266.771442 |
| 4529 | 2022-12-11 | 781176.0 | 723883.0 | 17094.360659 | 1263.069667 |
4530 rows × 5 columns
Melt from MultiIndex#
# pivot_df = pd.melt(pivot_df, col_level=0, id_vars=["date"])
pivot_df = pd.melt(pivot_df, id_vars=[("date", "")])
pivot_df.columns = ["date", "metric", "asset", "value"]
pivot_df
| date | metric | asset | value | |
|---|---|---|---|---|
| 0 | 2010-07-18 | AdrActCnt | btc | 860.000000 |
| 1 | 2010-07-19 | AdrActCnt | btc | 929.000000 |
| 2 | 2010-07-20 | AdrActCnt | btc | 936.000000 |
| 3 | 2010-07-21 | AdrActCnt | btc | 784.000000 |
| 4 | 2010-07-22 | AdrActCnt | btc | 594.000000 |
| ... | ... | ... | ... | ... |
| 18115 | 2022-12-07 | PriceUSD | eth | 1232.552454 |
| 18116 | 2022-12-08 | PriceUSD | eth | 1281.633052 |
| 18117 | 2022-12-09 | PriceUSD | eth | 1263.208619 |
| 18118 | 2022-12-10 | PriceUSD | eth | 1266.771442 |
| 18119 | 2022-12-11 | PriceUSD | eth | 1263.069667 |
18120 rows × 4 columns
Pivot back to OG (single Index)#
pivot_df = pivot_df.pivot(index=["date", "asset"], columns="metric", values="value")
pivot_df.columns = pivot_df.columns.rename("")
pivot_df.reset_index(drop=False, inplace=True)
pivot_df
| date | asset | AdrActCnt | PriceUSD | |
|---|---|---|---|---|
| 0 | 2010-07-18 | btc | 860.0 | 0.085840 |
| 1 | 2010-07-18 | eth | NaN | NaN |
| 2 | 2010-07-19 | btc | 929.0 | 0.080800 |
| 3 | 2010-07-19 | eth | NaN | NaN |
| 4 | 2010-07-20 | btc | 936.0 | 0.074736 |
| ... | ... | ... | ... | ... |
| 9055 | 2022-12-09 | eth | 1505652.0 | 1263.208619 |
| 9056 | 2022-12-10 | btc | 909506.0 | 17124.214680 |
| 9057 | 2022-12-10 | eth | 733795.0 | 1266.771442 |
| 9058 | 2022-12-11 | btc | 781176.0 | 17094.360659 |
| 9059 | 2022-12-11 | eth | 723883.0 | 1263.069667 |
9060 rows × 4 columns
Pivot to Date Index (Fill missing dates)#
# Build a complete daily date range spanning the data, pivot prices to a
# date-indexed wide frame, then reindex so dropped/missing dates come back
# as NaN rows.
date_rng = pd.date_range(test_df["date"].min(), test_df["date"].max())
pivot_fill_df = test_df.pivot(index="date", columns="asset", values='PriceUSD')
# pivot_fill_df = pivot_fill_df.drop(labels=[1, 3, 4524], axis=0) # drop by index value
pivot_fill_df = pivot_fill_df.drop(pivot_fill_df.index[[1, 3, 4524]]) # drop by row num
pivot_fill_df = pivot_fill_df.reindex(date_rng, fill_value=np.nan)
pivot_fill_df
| asset | btc | eth |
|---|---|---|
| 2010-07-18 | 0.085840 | NaN |
| 2010-07-19 | NaN | NaN |
| 2010-07-20 | 0.074736 | NaN |
| 2010-07-21 | NaN | NaN |
| 2010-07-22 | 0.058470 | NaN |
| ... | ... | ... |
| 2022-12-07 | 16848.251824 | 1232.552454 |
| 2022-12-08 | 17231.456322 | 1281.633052 |
| 2022-12-09 | 17136.149001 | 1263.208619 |
| 2022-12-10 | 17124.214680 | 1266.771442 |
| 2022-12-11 | 17094.360659 | 1263.069667 |
4530 rows × 2 columns
Join & Merge#
article on when to use join vs merge
Merge:
can join on indices or columns
validate - check 1:1, 1:many, etc. (also available for join)
indicator - produces additional column to indicate “left_only”, “right_only”, or “both”
# Working copy for the join/merge examples: rows from label 3000 onward,
# sorted by date with a fresh RangeIndex.
join_merge_df = test_df.loc[3000:, ["date", "datetime", "asset", 'AdrActCnt', 'PriceUSD']].copy()
join_merge_df = join_merge_df.sort_values(by="date").reset_index(drop=True)
join_merge_df
| date | datetime | asset | AdrActCnt | PriceUSD | |
|---|---|---|---|---|---|
| 0 | 2015-08-08 | 2015-08-08 00:00:00+00:00 | eth | 1208.0 | 1.199990 |
| 1 | 2015-08-09 | 2015-08-09 00:00:00+00:00 | eth | 1113.0 | 1.199990 |
| 2 | 2015-08-10 | 2015-08-10 00:00:00+00:00 | eth | 1430.0 | 1.199990 |
| 3 | 2015-08-11 | 2015-08-11 00:00:00+00:00 | eth | 2697.0 | 0.990000 |
| 4 | 2015-08-12 | 2015-08-12 00:00:00+00:00 | eth | 1219.0 | 1.288000 |
| ... | ... | ... | ... | ... | ... |
| 4208 | 2022-12-09 | 2022-12-09 00:00:00+00:00 | eth | 1505652.0 | 1263.208619 |
| 4209 | 2022-12-10 | 2022-12-10 00:00:00+00:00 | eth | 733795.0 | 1266.771442 |
| 4210 | 2022-12-10 | 2022-12-10 00:00:00+00:00 | btc | 909506.0 | 17124.214680 |
| 4211 | 2022-12-11 | 2022-12-11 00:00:00+00:00 | btc | 781176.0 | 17094.360659 |
| 4212 | 2022-12-11 | 2022-12-11 00:00:00+00:00 | eth | 723883.0 | 1263.069667 |
4213 rows × 5 columns
# Split into two frames sharing the (date, asset) keys: df1 carries AdrActCnt
# for labels 0..3000 only, df2 carries PriceUSD for all rows.
join_cols = ["date", "asset"]
join_merge_df1 = join_merge_df.loc[:3000, join_cols + ["AdrActCnt"]]  # .loc is label-based and end-inclusive
join_merge_df2 = join_merge_df.loc[:, join_cols + ["PriceUSD"]]
print(join_merge_df1)
print(join_merge_df2)
date asset AdrActCnt
0 2015-08-08 eth 1208.0
1 2015-08-09 eth 1113.0
2 2015-08-10 eth 1430.0
3 2015-08-11 eth 2697.0
4 2015-08-12 eth 1219.0
... ... ... ...
2996 2021-04-12 btc 1126723.0
2997 2021-04-13 eth 641476.0
2998 2021-04-13 btc 1178027.0
2999 2021-04-14 eth 662089.0
3000 2021-04-14 btc 1149773.0
[3001 rows x 3 columns]
date asset PriceUSD
0 2015-08-08 eth 1.199990
1 2015-08-09 eth 1.199990
2 2015-08-10 eth 1.199990
3 2015-08-11 eth 0.990000
4 2015-08-12 eth 1.288000
... ... ... ...
4208 2022-12-09 eth 1263.208619
4209 2022-12-10 eth 1266.771442
4210 2022-12-10 btc 17124.214680
4211 2022-12-11 btc 17094.360659
4212 2022-12-11 eth 1263.069667
[4213 rows x 3 columns]
# join() matches the caller's columns (via on=) against the OTHER frame's
# index, so the right frame must be indexed by the join keys first.
# NOTE(review): with how="outer", unmatched right-side rows keep the left
# frame's last index label — hence the repeated 3000 index in the output.
joined_df = join_merge_df1.join(join_merge_df2.set_index(keys=join_cols), how="outer", on=join_cols)
joined_df
| date | asset | AdrActCnt | PriceUSD | |
|---|---|---|---|---|
| 0 | 2015-08-08 | eth | 1208.0 | 1.199990 |
| 1 | 2015-08-09 | eth | 1113.0 | 1.199990 |
| 2 | 2015-08-10 | eth | 1430.0 | 1.199990 |
| 3 | 2015-08-11 | eth | 2697.0 | 0.990000 |
| 4 | 2015-08-12 | eth | 1219.0 | 1.288000 |
| ... | ... | ... | ... | ... |
| 3000 | 2022-12-09 | eth | NaN | 1263.208619 |
| 3000 | 2022-12-10 | eth | NaN | 1266.771442 |
| 3000 | 2022-12-10 | btc | NaN | 17124.214680 |
| 3000 | 2022-12-11 | btc | NaN | 17094.360659 |
| 3000 | 2022-12-11 | eth | NaN | 1263.069667 |
4213 rows × 4 columns
# merge() joins directly on columns; indicator=True appends a "_merge"
# column flagging each row as left_only / right_only / both.
merged_df = join_merge_df1.merge(join_merge_df2, how="outer", on=join_cols, indicator=True)
merged_df
| date | asset | AdrActCnt | PriceUSD | _merge | |
|---|---|---|---|---|---|
| 0 | 2015-08-08 | eth | 1208.0 | 1.199990 | both |
| 1 | 2015-08-09 | eth | 1113.0 | 1.199990 | both |
| 2 | 2015-08-10 | eth | 1430.0 | 1.199990 | both |
| 3 | 2015-08-11 | eth | 2697.0 | 0.990000 | both |
| 4 | 2015-08-12 | eth | 1219.0 | 1.288000 | both |
| ... | ... | ... | ... | ... | ... |
| 4208 | 2022-12-09 | eth | NaN | 1263.208619 | right_only |
| 4209 | 2022-12-10 | eth | NaN | 1266.771442 | right_only |
| 4210 | 2022-12-10 | btc | NaN | 17124.214680 | right_only |
| 4211 | 2022-12-11 | btc | NaN | 17094.360659 | right_only |
| 4212 | 2022-12-11 | eth | NaN | 1263.069667 | right_only |
4213 rows × 5 columns
Explode#
# Toy temperature table: city C stores a LIST of readings under day4, which
# the explode() example below expands into one row per reading.
explode_df = pd.DataFrame(
    {
        "city": ["A", "B", "C"],
        "day1": [22, 25, 21],
        "day2": [31, 12, 67],
        "day3": [27, 20, 15],
        "day4": [34, 37, [41, 45, 67, 90, 21]],
        "day5": [23, 54, 36],
    }
)
explode_df
| city | day1 | day2 | day3 | day4 | day5 | |
|---|---|---|---|---|---|---|
| 0 | A | 22 | 31 | 27 | 34 | 23 |
| 1 | B | 25 | 12 | 20 | 37 | 54 |
| 2 | C | 21 | 67 | 15 | [41, 45, 67, 90, 21] | 36 |
explode_df.explode("day4", ignore_index=False)
| city | day1 | day2 | day3 | day4 | day5 | |
|---|---|---|---|---|---|---|
| 0 | A | 22 | 31 | 27 | 34 | 23 |
| 1 | B | 25 | 12 | 20 | 37 | 54 |
| 2 | C | 21 | 67 | 15 | 41 | 36 |
| 3 | C | 21 | 67 | 15 | 45 | 36 |
| 4 | C | 21 | 67 | 15 | 67 | 36 |
| 5 | C | 21 | 67 | 15 | 90 | 36 |
| 6 | C | 21 | 67 | 15 | 21 | 36 |
Aggregation#
Aggregation functions#
mean(): Compute mean of groups
sum(): Compute sum of group values
size(): Compute group sizes
count(): Compute count of group
std(): Standard deviation of groups
var(): Compute variance of groups
sem(): Standard error of the mean of groups
first(): Compute first of group values
last(): Compute last of group values
nth() : Take nth value, or a subset if n is a list
min(): Compute min of group values
max(): Compute max of group values
# Column-wise count of non-null values for the selected metric columns.
agg_df = test_df[test_metrics]
agg_df.count()
AdrActCnt 7213
PriceUSD 7213
dtype: int64
agg_df.nunique()
AdrActCnt 7152
PriceUSD 7162
dtype: int64
agg_df.median()
AdrActCnt 406267.000000
PriceUSD 467.321788
dtype: float64
agg_df.quantile(q=0.10)
AdrActCnt 14612.000000
PriceUSD 5.476572
Name: 0.1, dtype: float64
Groupby#
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=_NoDefault.no_default, observed=False, dropna=True)
# Group by one key column and count distinct values of every other column.
group_df = test_df[["date", "year", "asset"] + test_metrics]
group_df.groupby(by="asset").nunique()
| date | year | AdrActCnt | PriceUSD | |
|---|---|---|---|---|
| asset | ||||
| btc | 4530 | 13 | 4504 | 4507 |
| eth | 2683 | 8 | 2667 | 2658 |
group_df.groupby(by=["year", "asset"]).nunique()
| date | AdrActCnt | PriceUSD | ||
|---|---|---|---|---|
| year | asset | |||
| 2010 | btc | 167 | 151 | 154 |
| 2011 | btc | 365 | 363 | 356 |
| 2012 | btc | 366 | 363 | 366 |
| 2013 | btc | 365 | 365 | 365 |
| 2014 | btc | 365 | 365 | 365 |
| ... | ... | ... | ... | ... |
| 2020 | eth | 366 | 365 | 366 |
| 2021 | btc | 365 | 364 | 365 |
| eth | 365 | 365 | 365 | |
| 2022 | btc | 345 | 345 | 345 |
| eth | 345 | 344 | 345 |
21 rows × 3 columns
Groupby Map#
# Map each column to its own list of aggregations; lambdas show up in the
# result columns as "<lambda_0>" (here: the max-min range of PriceUSD).
agg_map = {'AdrActCnt':['count', 'nunique'],
'PriceUSD':['max', 'min', lambda x: x.max() - x.min()]
}
group_df.groupby(by=["year", "asset"]).agg(agg_map)
| AdrActCnt | PriceUSD | |||||
|---|---|---|---|---|---|---|
| count | nunique | max | min | <lambda_0> | ||
| year | asset | |||||
| 2010 | btc | 167 | 151 | 0.400982 | 0.050541 | 0.350442 |
| 2011 | btc | 365 | 363 | 29.029921 | 0.295000 | 28.734921 |
| 2012 | btc | 366 | 363 | 13.755252 | 4.255239 | 9.500012 |
| 2013 | btc | 365 | 365 | 1134.932231 | 13.280607 | 1121.651624 |
| 2014 | btc | 365 | 365 | 914.459961 | 310.442004 | 604.017957 |
| ... | ... | ... | ... | ... | ... | ... |
| 2020 | eth | 366 | 365 | 751.208064 | 110.328202 | 640.879862 |
| 2021 | btc | 365 | 364 | 67541.755508 | 29380.693733 | 38161.061775 |
| eth | 365 | 365 | 4811.156463 | 730.914321 | 4080.242142 | |
| 2022 | btc | 345 | 345 | 47560.009382 | 15758.291282 | 31801.718100 |
| eth | 345 | 344 | 3832.365610 | 992.790097 | 2839.575513 | |
21 rows × 5 columns
Groupby Datetime#
DataFrame.groupby(pd.Grouper(key="datetime", axis=0, freq="M"))
# Pivot onto the datetime index so pd.Grouper can bucket rows by frequency;
# columns become a (metric, asset) MultiIndex.
group_dt_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
group_dt_df
| AdrActCnt | PriceUSD | |||
|---|---|---|---|---|
| asset | btc | eth | btc | eth |
| datetime | ||||
| 2010-07-18 00:00:00+00:00 | 860.0 | NaN | 0.085840 | NaN |
| 2010-07-19 00:00:00+00:00 | 929.0 | NaN | 0.080800 | NaN |
| 2010-07-20 00:00:00+00:00 | 936.0 | NaN | 0.074736 | NaN |
| 2010-07-21 00:00:00+00:00 | 784.0 | NaN | 0.079193 | NaN |
| 2010-07-22 00:00:00+00:00 | 594.0 | NaN | 0.058470 | NaN |
| ... | ... | ... | ... | ... |
| 2022-12-07 00:00:00+00:00 | 899850.0 | 517421.0 | 16848.251824 | 1232.552454 |
| 2022-12-08 00:00:00+00:00 | 905789.0 | 528927.0 | 17231.456322 | 1281.633052 |
| 2022-12-09 00:00:00+00:00 | 946767.0 | 1505652.0 | 17136.149001 | 1263.208619 |
| 2022-12-10 00:00:00+00:00 | 909506.0 | 733795.0 | 17124.214680 | 1266.771442 |
| 2022-12-11 00:00:00+00:00 | 781176.0 | 723883.0 | 17094.360659 | 1263.069667 |
4530 rows × 4 columns
group_dt_df.groupby(pd.Grouper(axis=0, freq='M')).last()
| AdrActCnt | PriceUSD | |||
|---|---|---|---|---|
| asset | btc | eth | btc | eth |
| datetime | ||||
| 2010-07-31 00:00:00+00:00 | 479.0 | NaN | 0.067546 | NaN |
| 2010-08-31 00:00:00+00:00 | 733.0 | NaN | 0.060000 | NaN |
| 2010-09-30 00:00:00+00:00 | 640.0 | NaN | 0.061900 | NaN |
| 2010-10-31 00:00:00+00:00 | 609.0 | NaN | 0.192500 | NaN |
| 2010-11-30 00:00:00+00:00 | 817.0 | NaN | 0.208200 | NaN |
| ... | ... | ... | ... | ... |
| 2022-08-31 00:00:00+00:00 | 916812.0 | 467318.0 | 20024.671606 | 1551.729221 |
| 2022-09-30 00:00:00+00:00 | 1005499.0 | 484536.0 | 19435.194692 | 1327.075996 |
| 2022-10-31 00:00:00+00:00 | 980614.0 | 476137.0 | 20490.858181 | 1571.203871 |
| 2022-11-30 00:00:00+00:00 | 946754.0 | 525545.0 | 17176.898691 | 1298.039420 |
| 2022-12-31 00:00:00+00:00 | 781176.0 | 723883.0 | 17094.360659 | 1263.069667 |
150 rows × 4 columns
group_dt_df.groupby(pd.Grouper(axis=0, freq='Y')).first()
| AdrActCnt | PriceUSD | |||
|---|---|---|---|---|
| asset | btc | eth | btc | eth |
| datetime | ||||
| 2010-12-31 00:00:00+00:00 | 860.0 | NaN | 0.085840 | NaN |
| 2011-12-31 00:00:00+00:00 | 1071.0 | NaN | 0.300000 | NaN |
| 2012-12-31 00:00:00+00:00 | 11474.0 | NaN | 5.294843 | NaN |
| 2013-12-31 00:00:00+00:00 | 38733.0 | NaN | 13.331371 | NaN |
| 2014-12-31 00:00:00+00:00 | 96516.0 | NaN | 752.404550 | NaN |
| ... | ... | ... | ... | ... |
| 2018-12-31 00:00:00+00:00 | 972783.0 | 520683.0 | 13464.653612 | 756.071766 |
| 2019-12-31 00:00:00+00:00 | 433715.0 | 227755.0 | 3808.117832 | 139.154644 |
| 2020-12-31 00:00:00+00:00 | 524360.0 | 231794.0 | 7170.631869 | 129.963875 |
| 2021-12-31 00:00:00+00:00 | 1001890.0 | 511250.0 | 29380.693733 | 730.914321 |
| 2022-12-31 00:00:00+00:00 | 695722.0 | 581311.0 | 47560.009382 | 3761.059640 |
13 rows × 4 columns
Rolling/Window#
DataFrame.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, step=None, method='single')
Shorthand methods:
cumsum()
pct_change()
# Datetime-indexed wide frame (one column per metric/asset) for the rolling
# examples below.
rolling_df = test_df.pivot(index="datetime", columns="asset", values=test_metrics)
rolling_df
| AdrActCnt | PriceUSD | |||
|---|---|---|---|---|
| asset | btc | eth | btc | eth |
| datetime | ||||
| 2010-07-18 00:00:00+00:00 | 860.0 | NaN | 0.085840 | NaN |
| 2010-07-19 00:00:00+00:00 | 929.0 | NaN | 0.080800 | NaN |
| 2010-07-20 00:00:00+00:00 | 936.0 | NaN | 0.074736 | NaN |
| 2010-07-21 00:00:00+00:00 | 784.0 | NaN | 0.079193 | NaN |
| 2010-07-22 00:00:00+00:00 | 594.0 | NaN | 0.058470 | NaN |
| ... | ... | ... | ... | ... |
| 2022-12-07 00:00:00+00:00 | 899850.0 | 517421.0 | 16848.251824 | 1232.552454 |
| 2022-12-08 00:00:00+00:00 | 905789.0 | 528927.0 | 17231.456322 | 1281.633052 |
| 2022-12-09 00:00:00+00:00 | 946767.0 | 1505652.0 | 17136.149001 | 1263.208619 |
| 2022-12-10 00:00:00+00:00 | 909506.0 | 733795.0 | 17124.214680 | 1266.771442 |
| 2022-12-11 00:00:00+00:00 | 781176.0 | 723883.0 | 17094.360659 | 1263.069667 |
4530 rows × 4 columns
rolling_df.rolling(5, min_periods=3, center=False).mean()
| AdrActCnt | PriceUSD | |||
|---|---|---|---|---|
| asset | btc | eth | btc | eth |
| datetime | ||||
| 2010-07-18 00:00:00+00:00 | NaN | NaN | NaN | NaN |
| 2010-07-19 00:00:00+00:00 | NaN | NaN | NaN | NaN |
| 2010-07-20 00:00:00+00:00 | 908.333333 | NaN | 0.080459 | NaN |
| 2010-07-21 00:00:00+00:00 | 877.250000 | NaN | 0.080142 | NaN |
| 2010-07-22 00:00:00+00:00 | 820.600000 | NaN | 0.075808 | NaN |
| ... | ... | ... | ... | ... |
| 2022-12-07 00:00:00+00:00 | 916719.200000 | 566109.2 | 16980.268361 | 1256.446487 |
| 2022-12-08 00:00:00+00:00 | 910106.200000 | 552708.4 | 17046.658149 | 1264.586847 |
| 2022-12-09 00:00:00+00:00 | 930989.400000 | 727531.4 | 17049.544952 | 1261.221877 |
| 2022-12-10 00:00:00+00:00 | 920778.000000 | 765902.2 | 17082.117531 | 1262.870242 |
| 2022-12-11 00:00:00+00:00 | 888617.600000 | 801935.6 | 17086.886497 | 1261.447047 |
4530 rows × 4 columns
Gaussian Window:
# Plot a Gaussian window, rescaling the x-axis to z-scores so the bell is
# centered at 0 with unit spacing in standard deviations.
num = 18
std_dev = 4.5
window = scipy.signal.windows.gaussian(num, std=std_dev)
z_score = np.linspace(-num/2/std_dev, num/2/std_dev, num)
plt.plot(z_score, window)
# BUG FIX: title previously hard-coded sigma=7 while std_dev is 4.5 —
# derive the label from the actual parameter instead.
plt.title(rf"Gaussian window ($\sigma$={std_dev})")
plt.ylabel("Amplitude")
plt.xlabel("Z-Score")
# BUG FIX: was plt.figure(), which only opened an extra empty figure
# (visible as the stray "<Figure ... with 0 Axes>" in the output).
plt.show()
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
Binning#
cut#
Bins are of equal size (width), but the number of entries per bin may not be similar.
# Equal-WIDTH bins: cut splits the value range into 5 same-sized intervals,
# so bin populations can be very uneven.
# returns pd.Series
pd.cut(test_df['PriceUSD'], bins=5).value_counts()
(-67.491, 13508.392] 6401
(13508.392, 27016.733] 281
(40525.074, 54033.415] 211
(27016.733, 40525.074] 208
(54033.415, 67541.756] 112
Name: PriceUSD, dtype: int64
qcut#
Bins are not of equal size (width), but the number of entries per bin is similar.
# Equal-FREQUENCY bins: qcut chooses interval edges so each quintile holds
# roughly the same number of rows.
# returns pd.Series
pd.qcut(test_df['PriceUSD'], q=5).value_counts()
(0.0495, 14.711] 1443
(285.218, 975.619] 1443
(7341.99, 67541.756] 1443
(14.711, 285.218] 1442
(975.619, 7341.99] 1442
Name: PriceUSD, dtype: int64
Strings#
Regex cheat sheet here
Python string formatting cookbook
# Load the NBA roster sample and drop rows with any missing field.
# NOTE(review): hard-coded absolute Windows path — not portable.
nba_df = pd.read_csv("C:/Users/wsaye/PycharmProjects/CashAppInterview/data/nba.csv")
nba_df.dropna(inplace=True)
nba_df
| Name | Team | Number | Position | Age | Height | Weight | College | Salary | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Avery Bradley | Boston Celtics | 0.0 | PG | 25.0 | 6-2 | 180.0 | Texas | 7730337.0 |
| 1 | Jae Crowder | Boston Celtics | 99.0 | SF | 25.0 | 6-6 | 235.0 | Marquette | 6796117.0 |
| 3 | R.J. Hunter | Boston Celtics | 28.0 | SG | 22.0 | 6-5 | 185.0 | Georgia State | 1148640.0 |
| 6 | Jordan Mickey | Boston Celtics | 55.0 | PF | 21.0 | 6-8 | 235.0 | LSU | 1170960.0 |
| 7 | Kelly Olynyk | Boston Celtics | 41.0 | C | 25.0 | 7-0 | 238.0 | Gonzaga | 2165160.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 449 | Rodney Hood | Utah Jazz | 5.0 | SG | 23.0 | 6-8 | 206.0 | Duke | 1348440.0 |
| 451 | Chris Johnson | Utah Jazz | 23.0 | SF | 26.0 | 6-6 | 206.0 | Dayton | 981348.0 |
| 452 | Trey Lyles | Utah Jazz | 41.0 | PF | 20.0 | 6-10 | 234.0 | Kentucky | 2239800.0 |
| 453 | Shelvin Mack | Utah Jazz | 8.0 | PG | 26.0 | 6-3 | 203.0 | Butler | 2433333.0 |
| 456 | Jeff Withey | Utah Jazz | 24.0 | C | 26.0 | 7-0 | 231.0 | Kansas | 947276.0 |
364 rows × 9 columns
Search/Replace#
# str.find returns the character position of the substring (-1 if absent);
# ">= 1" is True only when "J" appears at index 1 or later, so names that
# START with "J" (position 0) are excluded.
# does not assume regex
# (nba_df["Name"].str.find("a") >= 1).sum()
nba_df["Name"].str.find("J") >= 1
0 False
1 False
3 True
6 False
7 False
...
449 False
451 True
452 False
453 False
456 False
Name: Name, Length: 364, dtype: bool
# extract() returns the first regex capture group per row, NaN where the
# pattern does not match.
# assumes regex
# nba_df["Name"].str.contains("^[jJ]")
# nba_df["Name"].str.contains("^(Jo)")
nba_df["Name"].str.extract("^(Jo)")
| 0 | |
|---|---|
| 0 | NaN |
| 1 | NaN |
| 3 | NaN |
| 6 | Jo |
| 7 | NaN |
| ... | ... |
| 449 | NaN |
| 451 | NaN |
| 452 | NaN |
| 453 | NaN |
| 456 | NaN |
364 rows × 1 columns
nba_df["Name"].str.replace("^(Jo)", "PORKY", regex=True)
0 Avery Bradley
1 Jae Crowder
3 R.J. Hunter
6 PORKYrdan Mickey
7 Kelly Olynyk
...
449 Rodney Hood
451 Chris Johnson
452 Trey Lyles
453 Shelvin Mack
456 Jeff Withey
Name: Name, Length: 364, dtype: object
Split, Concat#
# split() returns a Series of lists (one token list per name).
# assumes regex if len > 1, else literal
nba_df["Name"].str.split(" ")
0 [Avery, Bradley]
1 [Jae, Crowder]
3 [R.J., Hunter]
6 [Jordan, Mickey]
7 [Kelly, Olynyk]
...
449 [Rodney, Hood]
451 [Chris, Johnson]
452 [Trey, Lyles]
453 [Shelvin, Mack]
456 [Jeff, Withey]
Name: Name, Length: 364, dtype: object
# expand=True returns a DataFrame instead of lists; n=1 splits at most once,
# so everything after the first space lands in column 1.
# assumes regex if len > 1, else literal
nba_df["Name"].str.split(" ", n=1, expand=True)
| 0 | 1 | |
|---|---|---|
| 0 | Avery | Bradley |
| 1 | Jae | Crowder |
| 3 | R.J. | Hunter |
| 6 | Jordan | Mickey |
| 7 | Kelly | Olynyk |
| ... | ... | ... |
| 449 | Rodney | Hood |
| 451 | Chris | Johnson |
| 452 | Trey | Lyles |
| 453 | Shelvin | Mack |
| 456 | Jeff | Withey |
364 rows × 2 columns
nba_df["Name"].str.split(" ").str.len().describe()
count 364.000000
mean 2.032967
std 0.220220
min 2.000000
25% 2.000000
50% 2.000000
75% 2.000000
max 5.000000
Name: Name, dtype: float64
nba_df["Name"].str.cat(others=[nba_df["Team"], nba_df["College"]], sep="-")
0 Avery Bradley-Boston Celtics-Texas
1 Jae Crowder-Boston Celtics-Marquette
3 R.J. Hunter-Boston Celtics-Georgia State
6 Jordan Mickey-Boston Celtics-LSU
7 Kelly Olynyk-Boston Celtics-Gonzaga
...
449 Rodney Hood-Utah Jazz-Duke
451 Chris Johnson-Utah Jazz-Dayton
452 Trey Lyles-Utah Jazz-Kentucky
453 Shelvin Mack-Utah Jazz-Butler
456 Jeff Withey-Utah Jazz-Kansas
Name: Name, Length: 364, dtype: object
Formatting#
# Case conversion returns a new Series; upper() is the mirror of lower().
# nba_df["Name"].str.upper()
nba_df["Name"].str.lower()
0 avery bradley
1 jae crowder
3 r.j. hunter
6 jordan mickey
7 kelly olynyk
...
449 rodney hood
451 chris johnson
452 trey lyles
453 shelvin mack
456 jeff withey
Name: Name, Length: 364, dtype: object
# Format salaries in millions as currency strings, e.g. 7730337.0 -> "$7.73M";
# .map with a bound format method works element-wise on a single Series.
nba_df['Salary_str'] = (nba_df['Salary']/10**6).map('${:,.2f}M'.format) # for single Series
nba_df['Salary_str']
# cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format) # for entire df
0 $7.73M
1 $6.80M
3 $1.15M
6 $1.17M
7 $2.17M
...
449 $1.35M
451 $0.98M
452 $2.24M
453 $2.43M
456 $0.95M
Name: Salary_str, Length: 364, dtype: object
Modeling#
Linear Regression#
# Load the regression sample and shape each column as a 2-D (n, 1) array,
# which sklearn estimators expect for single-feature inputs.
lin_reg_df = pd.read_csv('C:/Users/wsaye/PycharmProjects/CashAppInterview/data/lin_reg_test_data.csv') # load data set
X = lin_reg_df.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = lin_reg_df.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
COLOR = lin_reg_df.iloc[:, 2].values.reshape(-1, 1)
# One-hot encode the categorical color column; sparse_output=False returns a
# dense ndarray (parameter name requires sklearn >= 1.2 — verify pinned version).
ohe = OneHotEncoder(sparse_output=False)
ohe_vals = ohe.fit_transform(COLOR)
# Feature matrix = numeric x plus the one-hot color indicator columns.
X_mat = np.concatenate([X, ohe_vals], axis=1)
def regression_results(y_true, y_pred, lin_reg):
    """Print a summary of regression-quality metrics for a fitted model.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values.
    lin_reg : fitted estimator
        Only ``coef_`` and ``intercept_`` are read, for reporting.

    Relies on the module-level imports ``sklearn.metrics as metrics``
    and ``numpy as np``.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(mean_squared_log_error, 4))
    print('r^2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))
    # BUG FIX: median_absolute_error was computed but never reported.
    print('MedAE: ', round(median_absolute_error, 4))
    print(f"Coefficients: {lin_reg.coef_}")
    print(f"Intercept: {lin_reg.intercept_}")
Test/Train Split#
X_train, X_test, y_train, y_test = train_test_split(X_mat, Y, test_size = 0.25)
Fit, Predict, Summarize#
# Fit on the training split, score on the held-out test split, and also
# predict over the full feature matrix for plotting later.
regr = LinearRegression()
regr.fit(X_train, y_train)
y_test_pred = regr.predict(X_test)
lin_reg_df["y_pred"] = regr.predict(X_mat)
regression_results(y_test, y_test_pred, regr)
explained_variance: 0.9397
mean_squared_log_error: 0.0248
r^2: 0.9381
MAE: 10.1719
MSE: 151.7468
RMSE: 12.3186
Coefficients: [[ 1.38349831 112.5431928 -125.45870138 12.91550859]]
Intercept: [-8.26839901]
K-Means#
# Cluster the raw (x, y) points into 3 groups; labels_ covers the training
# set, predict() assigns clusters to the held-out points.
from sklearn.cluster import KMeans
X_ktrain, X_ktest, y_ktrain, y_ktest = train_test_split(X, Y, test_size = 0.25)
train_set = np.concatenate([X_ktrain, y_ktrain], axis=1)
test_set = np.concatenate([X_ktest, y_ktest], axis=1)
kmeans = KMeans(n_clusters=3, random_state=0, n_init="auto").fit(train_set)
train_labels = kmeans.labels_
test_labels = kmeans.predict(test_set)
Plot Model:
# Side-by-side scatter plots colored by cluster label: train on the left,
# test (x markers) on the right.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x=X_ktrain[:, 0], y=y_ktrain[:, 0], mode='markers', name='train',
marker=dict(color=train_labels, line_color="black", line_width=1)), row=1, col=1)
fig.add_trace(go.Scatter(x=X_ktest[:, 0], y=y_ktest[:, 0], mode='markers', name='test',
marker=dict(color=test_labels, line_color="black", line_width=1, symbol="x")), row=1, col=2)
fig.show()
Plotting#
Matplotlib#
# Raw data as a scatter plus model predictions as green x markers
# (linestyle='None' suppresses the connecting line).
# plt.scatter(X, Y)
# plt.plot(X, Y_pred, color="red", linestyle='None', marker="x")
plt.scatter(lin_reg_df["x"], lin_reg_df["y"])
plt.plot(lin_reg_df["x"], lin_reg_df["y_pred"], color="green", linestyle='None', marker="x")
plt.show()
Plotly (Standard)#
# Grey scatter of the raw data, then one fitted line per color category
# so each one-hot segment of the model is visible.
fig = go.Figure()
fig.add_trace(go.Scatter(x=X[:, 0], y=Y[:, 0], mode='markers', name='raw data', marker=dict(color="grey")))
# fig.add_trace(go.Scatter(x=X[:, 0], y=Y_pred[:, 0], mode='markers', name='prediction', marker=dict(color=COLOR[:, 0])))
for c in list(np.unique(COLOR[:, 0])):
    # Predictions restricted to this color category.
    temp_x = lin_reg_df.loc[lin_reg_df["color"]==c, "x"]
    temp_y = lin_reg_df.loc[lin_reg_df["color"]==c, "y_pred"]
    fig.add_trace(go.Scatter(x=temp_x, y=temp_y, mode='lines', name='pred-' + c, line_color=c))
fig.show()
Plotly Express#
# Plotly Express one-liner: scatter colored by the categorical column.
fig = px.scatter(lin_reg_df, x="x", y="y", color="color")
fig.show()
# Same scatter faceted into one subplot per color category; commented kwargs
# show row faceting and an OLS trendline option.
fig = px.scatter(lin_reg_df, x="x", y="y", color="color",
facet_col="color",
# facet_row="time",
# trendline="ols"
)
fig.show()
Sample Analyses#
Cohort#
# Simulate per-user activity logs: each of 1000 users is assigned a random
# number of active days (at least 2), sampled with replacement from a fixed
# 10-day window. Seeded so the output is reproducible.
rand_gen = np.random.RandomState(2021)  # set seed
start_date = dt.datetime.strptime("2022-01-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2022-01-10", "%Y-%m-%d")
date_rng = pd.date_range(start_date, end_date).values
total_days = len(date_rng)
num_users = 1000

user_df_list = []
for u in range(num_users):
    # Two RNG draws per user, always in the same order: the day count,
    # then the day indices (with replacement, so repeats are possible).
    n_active = rand_gen.randint(low=2, high=total_days)
    day_idx = rand_gen.randint(low=0, high=total_days, size=(1, n_active))
    frame = pd.concat(
        [pd.Series(date_rng[day_idx[0, :]]), pd.Series([u] * n_active)],
        axis=1,
    )
    frame.columns = ["date", "user_id"]
    user_df_list.append(frame)

# Stack all per-user frames into one long activity log.
cohort_df = pd.concat(user_df_list)
cohort_df
| date | user_id | |
|---|---|---|
| 0 | 2022-01-06 | 0 |
| 1 | 2022-01-10 | 0 |
| 2 | 2022-01-01 | 0 |
| 3 | 2022-01-07 | 0 |
| 4 | 2022-01-06 | 0 |
| ... | ... | ... |
| 2 | 2022-01-10 | 999 |
| 3 | 2022-01-06 | 999 |
| 4 | 2022-01-10 | 999 |
| 5 | 2022-01-03 | 999 |
| 6 | 2022-01-10 | 999 |
5443 rows × 2 columns
# Each user's earliest activity date becomes their cohort start date,
# broadcast back onto every activity row via a left join on user_id.
first_date = cohort_df.groupby(by=["user_id"]).min().rename(columns={"date": "start_date"})
cohort_df = cohort_df.join(first_date, on="user_id", how="left")
cohort_df
| date | user_id | start_date | |
|---|---|---|---|
| 0 | 2022-01-06 | 0 | 2022-01-01 |
| 1 | 2022-01-10 | 0 | 2022-01-01 |
| 2 | 2022-01-01 | 0 | 2022-01-01 |
| 3 | 2022-01-07 | 0 | 2022-01-01 |
| 4 | 2022-01-06 | 0 | 2022-01-01 |
| ... | ... | ... | ... |
| 2 | 2022-01-10 | 999 | 2022-01-03 |
| 3 | 2022-01-06 | 999 | 2022-01-03 |
| 4 | 2022-01-10 | 999 | 2022-01-03 |
| 5 | 2022-01-03 | 999 | 2022-01-03 |
| 6 | 2022-01-10 | 999 | 2022-01-03 |
5443 rows × 3 columns
# Retention matrix: rows = cohort start date, columns = activity date.
# The diagonal holds each cohort's day-0 event count, used as denominator.
cohort_crosstab = pd.crosstab(cohort_df['start_date'], cohort_df['date'])
cohort_totals = np.diag(cohort_crosstab).reshape(-1, 1)
cohort_crosstab[cohort_crosstab.columns] = 100 * cohort_crosstab.values / cohort_totals
# NOTE(review): crosstab counts EVENTS, not distinct users, so cells can
# exceed 100% (see the 167% cell in the output) — deduplicate
# (user_id, date) pairs first if per-user retention is intended.
cohort_crosstab = cohort_crosstab.applymap('{:,.0f}%'.format)
cohort_crosstab
| date | 2022-01-01 | 2022-01-02 | 2022-01-03 | 2022-01-04 | 2022-01-05 | 2022-01-06 | 2022-01-07 | 2022-01-08 | 2022-01-09 | 2022-01-10 |
|---|---|---|---|---|---|---|---|---|---|---|
| start_date | ||||||||||
| 2022-01-01 | 100% | 39% | 45% | 43% | 42% | 45% | 47% | 47% | 34% | 42% |
| 2022-01-02 | 0% | 100% | 44% | 42% | 45% | 41% | 43% | 45% | 38% | 42% |
| 2022-01-03 | 0% | 0% | 100% | 46% | 32% | 39% | 31% | 42% | 37% | 40% |
| 2022-01-04 | 0% | 0% | 0% | 100% | 38% | 43% | 33% | 39% | 35% | 43% |
| 2022-01-05 | 0% | 0% | 0% | 0% | 100% | 42% | 38% | 64% | 38% | 39% |
| 2022-01-06 | 0% | 0% | 0% | 0% | 0% | 100% | 24% | 37% | 44% | 44% |
| 2022-01-07 | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 47% | 63% | 42% |
| 2022-01-08 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 25% | 31% |
| 2022-01-09 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 167% |
| 2022-01-10 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% |
Funnel#
# Two-office funnel: stack per-office frames long-wise and let Plotly
# Express draw side-by-side funnels colored by office.
stages = ["Website visit", "Downloads", "Potential customers", "Requested price", "Invoice sent"]
df_mtl = pd.DataFrame(dict(number=[39, 27.4, 20.6, 11, 3], stage=stages))
df_mtl['office'] = 'Montreal'
df_toronto = pd.DataFrame(dict(number=[52, 36, 18, 14, 5], stage=stages))
df_toronto['office'] = 'Toronto'
df = pd.concat([df_mtl, df_toronto], axis=0)
fig = px.funnel(df, x='number', y='stage', color='office')
display(fig)
# Lower-level graph_objects funnel; textinfo shows each stage's raw value
# and its percentage of the initial stage.
trace0 = go.Funnel(
y = stages,
x = [49, 29, 26, 11, 2],
textinfo = "value+percent initial")
# Fill out data with our traces
traces = [trace0]
# Plot it and save as basic-line.html
pyo.iplot(traces, filename = 'funnel_2')